Chapter 4

In this exercise we use the Boston data from the MASS library. This dataset contains information collected by the U.S. Census Service concerning housing in the area of Boston, Mass. The data include 14 variables and 506 rows.

# access the MASS package and load other libraries for later analysis
library(MASS)
library(corrplot)
## corrplot 0.84 loaded
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# load the Boston housing data from the MASS package into the workspace
 data("Boston")
# structure: 506 observations of 14 numeric/integer variables
str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
dim(Boston)
## [1] 506  14
#Show a graphical overview of the data and show summaries of the variables in the data. Describe and interpret the outputs, commenting on the distributions of the variables and the relationships between them
# numerical summaries (min, quartiles, mean, max) for every variable
summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
pairs(Boston)

There are some very interesting distributions of variables in the plot matrix. The variable rad takes mostly either low or high values, so its panels show the observations concentrated at the two ends of its range.

#Correlation matrix#

# Pearson correlations between all variable pairs, rounded to two decimals
cor_matrix <- cor(Boston)
cor_matrix <- round(cor_matrix, digits = 2)
# visualise the upper triangle as circles; colour legend at the bottom,
# variable labels on the diagonal
corrplot(cor_matrix,
         method = "circle", type = "upper",
         cl.pos = "b", tl.pos = "d", tl.cex = 0.6)

The plotted correlation matrix shows some high correlations between variables: * The correlation is quite clear between industrial areas (indus) and nitrogen oxides (nox): industry adds pollution to the area. Industry also seems to correlate with variables like age, dis, rad and tax. * The correlations of nitrogen oxides (nox) are very similar to those of industry (indus). * The crime rate (crim) seems to correlate with good accessibility to radial highways (rad) and with the property tax rate (tax). * Old houses (age) and distance to employment centers (dis) also have something in common.

#Scaled data# All the variables are numerical, so we can use the scale() function to scale the whole data set.

#Standardize the dataset and print out summaries of the scaled data. How did the variables change? 
# scale() centres each column to mean 0 and divides by its standard deviation
boston_scaled <- scale(Boston)
# after scaling every variable has mean 0 and is expressed in SD units
summary(boston_scaled)
##       crim                 zn               indus              chas        
##  Min.   :-0.419367   Min.   :-0.48724   Min.   :-1.5563   Min.   :-0.2723  
##  1st Qu.:-0.410563   1st Qu.:-0.48724   1st Qu.:-0.8668   1st Qu.:-0.2723  
##  Median :-0.390280   Median :-0.48724   Median :-0.2109   Median :-0.2723  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.007389   3rd Qu.: 0.04872   3rd Qu.: 1.0150   3rd Qu.:-0.2723  
##  Max.   : 9.924110   Max.   : 3.80047   Max.   : 2.4202   Max.   : 3.6648  
##       nox                rm               age               dis         
##  Min.   :-1.4644   Min.   :-3.8764   Min.   :-2.3331   Min.   :-1.2658  
##  1st Qu.:-0.9121   1st Qu.:-0.5681   1st Qu.:-0.8366   1st Qu.:-0.8049  
##  Median :-0.1441   Median :-0.1084   Median : 0.3171   Median :-0.2790  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5981   3rd Qu.: 0.4823   3rd Qu.: 0.9059   3rd Qu.: 0.6617  
##  Max.   : 2.7296   Max.   : 3.5515   Max.   : 1.1164   Max.   : 3.9566  
##       rad               tax             ptratio            black        
##  Min.   :-0.9819   Min.   :-1.3127   Min.   :-2.7047   Min.   :-3.9033  
##  1st Qu.:-0.6373   1st Qu.:-0.7668   1st Qu.:-0.4876   1st Qu.: 0.2049  
##  Median :-0.5225   Median :-0.4642   Median : 0.2746   Median : 0.3808  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.6596   3rd Qu.: 1.5294   3rd Qu.: 0.8058   3rd Qu.: 0.4332  
##  Max.   : 1.6596   Max.   : 1.7964   Max.   : 1.6372   Max.   : 0.4406  
##      lstat              medv        
##  Min.   :-1.5296   Min.   :-1.9063  
##  1st Qu.:-0.7986   1st Qu.:-0.5989  
##  Median :-0.1811   Median :-0.1449  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6024   3rd Qu.: 0.2683  
##  Max.   : 3.5453   Max.   : 2.9865
class(boston_scaled)
## [1] "matrix" "array"
boston_scaled <- as.data.frame(boston_scaled)

Scaling the data puts the variables on the same range. Before scaling, variables like black and tax were a hundredfold larger than some of the other variables.

#Create a categorical variable of the crime rate in the Boston dataset (from the scaled crime rate). Use the quantiles as the break points in the categorical variable. Drop the old crime rate variable from the dataset. 
# keep the scaled crime rate in its own vector for binning
scaled_crim <- boston_scaled$crim
# quartile break points: the 0%, 25%, 50%, 75% and 100% quantiles
# (spelled out explicitly; these are also quantile()'s defaults)
bins <- quantile(scaled_crim, probs = seq(0, 1, by = 0.25))
bins
##           0%          25%          50%          75%         100% 
## -0.419366929 -0.410563278 -0.390280295  0.007389247  9.924109610
# Bin the scaled crime rate into four quartile classes.
# FIX: the argument is `labels` -- the original spelled it `label` and
# relied on partial argument matching, which lintr flags and which breaks
# silently if cut() ever gains another argument starting with "label".
crime <- cut(scaled_crim, breaks = bins, include.lowest = TRUE,
             labels = c("low", "med_low", "med_high", "high"))
# class counts: quartile binning gives four roughly equal groups
crime_tab <- table(crime)
crime_tab
## crime
##      low  med_low med_high     high 
##      127      126      126      127
# replace the continuous crim column with the categorical crime factor:
# drop crim and bind the new factor in a single step
boston_scaled <- data.frame(dplyr::select(boston_scaled, -crim), crime)
summary(boston_scaled)
##        zn               indus              chas              nox         
##  Min.   :-0.48724   Min.   :-1.5563   Min.   :-0.2723   Min.   :-1.4644  
##  1st Qu.:-0.48724   1st Qu.:-0.8668   1st Qu.:-0.2723   1st Qu.:-0.9121  
##  Median :-0.48724   Median :-0.2109   Median :-0.2723   Median :-0.1441  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.04872   3rd Qu.: 1.0150   3rd Qu.:-0.2723   3rd Qu.: 0.5981  
##  Max.   : 3.80047   Max.   : 2.4202   Max.   : 3.6648   Max.   : 2.7296  
##        rm               age               dis               rad         
##  Min.   :-3.8764   Min.   :-2.3331   Min.   :-1.2658   Min.   :-0.9819  
##  1st Qu.:-0.5681   1st Qu.:-0.8366   1st Qu.:-0.8049   1st Qu.:-0.6373  
##  Median :-0.1084   Median : 0.3171   Median :-0.2790   Median :-0.5225  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.4823   3rd Qu.: 0.9059   3rd Qu.: 0.6617   3rd Qu.: 1.6596  
##  Max.   : 3.5515   Max.   : 1.1164   Max.   : 3.9566   Max.   : 1.6596  
##       tax             ptratio            black             lstat        
##  Min.   :-1.3127   Min.   :-2.7047   Min.   :-3.9033   Min.   :-1.5296  
##  1st Qu.:-0.7668   1st Qu.:-0.4876   1st Qu.: 0.2049   1st Qu.:-0.7986  
##  Median :-0.4642   Median : 0.2746   Median : 0.3808   Median :-0.1811  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.5294   3rd Qu.: 0.8058   3rd Qu.: 0.4332   3rd Qu.: 0.6024  
##  Max.   : 1.7964   Max.   : 1.6372   Max.   : 0.4406   Max.   : 3.5453  
##       medv              crime    
##  Min.   :-1.9063   low     :127  
##  1st Qu.:-0.5989   med_low :126  
##  Median :-0.1449   med_high:126  
##  Mean   : 0.0000   high    :127  
##  3rd Qu.: 0.2683                 
##  Max.   : 2.9865

#Train and test set# Training set contains 80% of the data. 20% is in the test set

#Divide the dataset to train and test sets, so that 80% of the data belongs to the train set
# number of rows in the scaled Boston dataset
n <- nrow(boston_scaled)
# fix the RNG so the train/test split is reproducible across knits
set.seed(123)
# draw 80% of the row indices without replacement; floor() makes the
# truncation of n * 0.8 (404.8 -> 404) explicit instead of relying on
# sample()'s silent coercion of a non-integer size
ind <- sample(n, size = floor(n * 0.8))
ind
##   [1] 279 366 242 108 425 170 370 368 105 292 321  29  30 164  72 167 313 405
##  [19] 277 357 160 206 177 298 247 402  25  71  90 311  21 457 204 183  28 208
##  [37] 291 445 122  74 243 295 225 325  59 334 408  55  65 450 148 506 462  41
##  [55] 227  77 443 407 274 138 388  47 259 193 360 174 387 271 171 280  78  48
##  [73]  32 269   3  69 301 282 406 135 196 142 209 107 246  99 503 273 483 353
##  [91] 261 487 110 422 300 113 106 335  37 336 251 442   8 500  87 320  92 166
## [109]  44 144  39 446 430 374 176 109 290   1 147 468 350 322 319 184 399 130
## [127] 327  80 373 288   2 116 285   6 484 464 478 213 289 270 488 231 172 349
## [145] 480 111  26 431 433 153  49  12 201  33 249 465  89 412 371 424 400  82
## [163] 489 237 195 447 163 490 155  73 146 221 428  64 491  22 502 258  40 472
## [181] 317   7 126 218 127 115 307   4 157  70  58  53 344 308 134 419 470  23
## [199]  45  98  96 411  85 141  20 358 212 180  79 328 498 414 341 392 314 128
## [217] 471 363 396 466  19 316 504 395 318  27 268 265 294 222 401  95  93 129
## [235]  51 449 346 364 365 418 168  38 140 439 361 415 286 403 367 149 159 132
## [253]  68  91 458 497 215 339  10 272 390 347 348 426 474 156 260 393 467 199
## [271] 413 276 253 378 192 456 112 214  67 329 383 343 254 102 391  56 416 354
## [289] 233 118 499 333 165 324 220  42 479 453 302 293 377 175 404 362 120 435
## [307]  76 179 169 386 191 485 380 275 235 239 207 398 463 323 205 379 460 121
## [325] 219  94  52 434 236 486 217 197  75 369 137 337 228 150 496 125 444 194
## [343] 299 161 203 100 356 420 162 342 475 461 451 181  46 182 297  66  63 158
## [361]  11 198 123 410  83 381 187 477 501 117 143  43 238 394 440  18 459 151
## [379] 417 436 375 256 185 427 178 448 211  88 189 284 312 352 267 283 264  62
## [397]  86 455 305 331 190 409 200 296
# create train set (the sampled 80% of rows)
train <- boston_scaled[ind,]
# create test set (the remaining 20%, i.e. rows not in ind)
test <- boston_scaled[-ind,]

#Fitting the Linear Discriminant Analysis# First the linear discriminant analysis (LDA) is fitted to the train set. The new categorical variable crime is the target variable and all the other variables of the dataset are predictor variables. After fitting we draw the LDA biplot with arrows

#Fit the linear discriminant analysis on the train set. Use the categorical crime rate as the target variable and all the other variables in the dataset as predictor variables. Draw the LDA (bi)plot.
#LDA = linear discriminant analysis: crime is the target, all other
#columns of train are predictors (formula shorthand `.`)
lda.fit <- lda(crime ~. , data = train)
#print the fit: prior probabilities, group means, coefficients, trace
lda.fit
## Call:
## lda(crime ~ ., data = train)
## 
## Prior probabilities of groups:
##       low   med_low  med_high      high 
## 0.2599010 0.2400990 0.2376238 0.2623762 
## 
## Group means:
##                  zn      indus        chas        nox          rm        age
## low       0.9264803 -0.8736212 -0.12234430 -0.8635528  0.49388666 -0.8652180
## med_low  -0.1219005 -0.3113325  0.05238023 -0.5888293 -0.12474033 -0.3693052
## med_high -0.3782608  0.2754342  0.26081992  0.4258509  0.06895497  0.4350788
## high     -0.4872402  1.0170298 -0.04947434  1.0727255 -0.46615802  0.8131836
##                 dis        rad        tax     ptratio       black       lstat
## low       0.8275237 -0.6898324 -0.7396930 -0.47699882  0.38228789 -0.78401261
## med_low   0.3365584 -0.5651079 -0.5172469 -0.09327041  0.35411133 -0.15882754
## med_high -0.3971522 -0.4088340 -0.2755184 -0.35283425  0.04343446  0.05802115
## high     -0.8589413  1.6390172  1.5146914  0.78181164 -0.83025061  0.89771191
##                 medv
## low       0.54660615
## med_low   0.03398401
## med_high  0.12724861
## high     -0.69964339
## 
## Coefficients of linear discriminants:
##                 LD1         LD2        LD3
## zn       0.08626915  0.61966180 -0.9309014
## indus    0.02505141 -0.31812395  0.1222383
## chas    -0.09902383 -0.07043254  0.1641390
## nox      0.38773763 -0.70450110 -1.2780305
## rm      -0.09051955 -0.07172573 -0.2156378
## age      0.26335060 -0.31317794 -0.1684386
## dis     -0.06340621 -0.26109140  0.0863800
## rad      3.18178797  1.07884005  0.0609861
## tax      0.02401227 -0.14351378  0.4477009
## ptratio  0.10544947  0.04216746 -0.2020851
## black   -0.13331726  0.02937662  0.1542919
## lstat    0.26744778 -0.26912751  0.3804142
## medv     0.24853158 -0.38398346 -0.1312317
## 
## Proportion of trace:
##    LD1    LD2    LD3 
## 0.9531 0.0351 0.0118
# Draw biplot arrows on an existing LDA plot: one arrow per original
# variable, pointing along its coefficients on the chosen discriminants.
#   x           an object with a coef() method (an lda fit)
#   myscale     multiplier for arrow length
#   arrow_heads length of the arrow heads (inches)
#   color, tex  arrow/label colour and label size
#   choices     which two discriminants to plot (default LD1 vs LD2)
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red",
                       tex = 0.75, choices = c(1, 2)) {
  ld_coef <- coef(x)
  x_tip <- myscale * ld_coef[, choices[1]]
  y_tip <- myscale * ld_coef[, choices[2]]
  arrows(x0 = 0, y0 = 0, x1 = x_tip, y1 = y_tip,
         col = color, length = arrow_heads)
  # label each arrow tip with the variable name
  text(myscale * ld_coef[, choices], labels = row.names(ld_coef),
       cex = tex, col = color, pos = 3)
}
#target classes as numeric codes (1..4) for plot colours/symbols
classes <- as.numeric(train$crime)
classes
##   [1] 1 4 2 2 4 3 4 4 2 1 2 3 3 3 2 3 3 4 2 4 3 2 1 2 3 4 3 2 1 3 3 4 1 2 3 2 1
##  [38] 4 1 2 2 1 3 3 2 1 4 1 1 4 3 1 4 1 3 2 4 4 2 3 4 2 3 2 4 2 4 3 3 2 2 2 3 3
##  [75] 1 2 1 1 4 3 1 3 2 2 2 1 1 2 4 1 3 4 3 4 1 2 2 1 2 1 2 4 2 2 1 3 1 3 2 4 2
## [112] 4 4 4 1 2 1 1 3 4 1 2 3 2 4 3 3 2 4 1 1 2 1 1 3 4 4 2 1 2 4 3 3 1 4 2 3 4
## [149] 4 3 2 2 1 3 2 4 1 4 4 4 4 1 2 3 1 4 3 2 3 2 3 3 4 2 2 3 1 3 1 4 3 2 2 1 3
## [186] 2 1 1 3 2 1 1 1 1 3 4 4 3 2 2 2 4 1 3 3 4 3 1 1 2 3 4 1 4 3 3 4 4 4 3 3 2
## [223] 1 4 2 3 3 3 2 3 4 1 1 3 2 4 1 4 3 4 3 1 3 4 4 4 1 4 4 3 3 3 1 1 4 3 3 1 2
## [260] 2 4 1 1 4 4 3 3 4 4 1 4 2 2 4 1 4 2 2 1 1 4 1 3 2 4 1 4 1 3 2 2 1 3 3 2 2
## [297] 4 4 1 1 4 2 4 4 2 4 2 1 3 4 2 3 4 1 3 2 2 4 4 3 1 4 4 1 2 1 1 4 3 3 1 1 1
## [334] 4 3 1 3 3 2 2 4 1 1 3 1 1 2 4 3 1 4 4 4 1 2 1 1 1 2 3 2 1 2 4 1 4 1 4 2 2
## [371] 3 2 3 4 4 3 4 3 4 4 4 1 2 4 1 4 2 1 2 1 3 1 3 1 3 2 1 4 1 1 2 4 1 2
#plot the first two discriminants, coloured/shaped by true class
plot(lda.fit, dimen = 2, col = classes, pch = classes)
# overlay arrows showing each variable's contribution to LD1/LD2
lda.arrows(lda.fit, myscale = 3)

#Predicting the classes#

#Save the crime categories from the test set and then remove the categorical crime variable from the test dataset. Then predict the classes with the LDA model on the test data. Cross tabulate the results with the crime categories from the test set. Comment on the results
#save the correct classes from test data before the column is removed
correct_classes <- test$crime
#remove the crime variable from test data so it cannot leak into prediction
test <- dplyr::select(test, -crime)
#predict classes for the test data with the fitted LDA model
lda.pred <- predict(lda.fit, newdata = test)
#cross tabulate true vs predicted classes (confusion matrix)
table(correct = correct_classes, predicted = lda.pred$class)
##           predicted
## correct    low med_low med_high high
##   low       16       6        0    0
##   med_low    7      14        8    0
##   med_high   0      12       17    1
##   high       0       0        0   21

Predictions were quite good. There were some errors in the middle of the range, but the classes low and especially high were predicted well. According to the confusion matrix, only one observation of the med_high class was incorrectly predicted into the high class, and every observation of the high class itself was predicted correctly.

#Reload the Boston dataset and standardize the dataset (we did not do this in the Datacamp exercises, but you should scale the variables to get comparable distances). Calculate the distances between the observations. Run k-means algorithm on the dataset. Investigate what is the optimal number of clusters and run the algorithm again. Visualize the clusters (for example with the pairs() or ggpairs() functions, where the clusters are separated with colors) and interpret the results
#Loading and scaling Boston data
# re-standardise the original data (the earlier scaled copy was modified)
scaled_Boston <- scale(Boston)
summary(scaled_Boston)
##       crim                 zn               indus              chas        
##  Min.   :-0.419367   Min.   :-0.48724   Min.   :-1.5563   Min.   :-0.2723  
##  1st Qu.:-0.410563   1st Qu.:-0.48724   1st Qu.:-0.8668   1st Qu.:-0.2723  
##  Median :-0.390280   Median :-0.48724   Median :-0.2109   Median :-0.2723  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.007389   3rd Qu.: 0.04872   3rd Qu.: 1.0150   3rd Qu.:-0.2723  
##  Max.   : 9.924110   Max.   : 3.80047   Max.   : 2.4202   Max.   : 3.6648  
##       nox                rm               age               dis         
##  Min.   :-1.4644   Min.   :-3.8764   Min.   :-2.3331   Min.   :-1.2658  
##  1st Qu.:-0.9121   1st Qu.:-0.5681   1st Qu.:-0.8366   1st Qu.:-0.8049  
##  Median :-0.1441   Median :-0.1084   Median : 0.3171   Median :-0.2790  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5981   3rd Qu.: 0.4823   3rd Qu.: 0.9059   3rd Qu.: 0.6617  
##  Max.   : 2.7296   Max.   : 3.5515   Max.   : 1.1164   Max.   : 3.9566  
##       rad               tax             ptratio            black        
##  Min.   :-0.9819   Min.   :-1.3127   Min.   :-2.7047   Min.   :-3.9033  
##  1st Qu.:-0.6373   1st Qu.:-0.7668   1st Qu.:-0.4876   1st Qu.: 0.2049  
##  Median :-0.5225   Median :-0.4642   Median : 0.2746   Median : 0.3808  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.6596   3rd Qu.: 1.5294   3rd Qu.: 0.8058   3rd Qu.: 0.4332  
##  Max.   : 1.6596   Max.   : 1.7964   Max.   : 1.6372   Max.   : 0.4406  
##      lstat              medv        
##  Min.   :-1.5296   Min.   :-1.9063  
##  1st Qu.:-0.7986   1st Qu.:-0.5989  
##  Median :-0.1811   Median :-0.1449  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6024   3rd Qu.: 0.2683  
##  Max.   : 3.5453   Max.   : 2.9865
#calculating the euclidean distance matrix between the observations
dist_eu <- dist(scaled_Boston)
#maximum number of clusters to try
cluster_max <- 15
# Total within-cluster sum of squares (TWCSS) for k = 1..cluster_max.
# K-means can give different results on each run because it randomly
# assigns the initial cluster centers; set.seed() makes it reproducible.
# FIX: k-means must run on the scaled observations themselves, not on the
# distance matrix -- passing dist_eu made kmeans treat each row of the
# 506x506 distance matrix as an observation. (vapply replaces sapply for a
# type-stable numeric result.)
set.seed(123)
twcss <- vapply(seq_len(cluster_max),
                function(k) kmeans(scaled_Boston, k)$tot.withinss,
                numeric(1))
# elbow plot: the optimal k is where TWCSS drops sharply
plot(1:cluster_max, twcss, type = 'b')

One way to determine the number of clusters is to look at how the total within-cluster sum of squares (WCSS) behaves when the number of clusters changes. WCSS was calculated for 1 to 15 clusters. The optimal number of clusters is where the total WCSS drops radically. It seems that in this case the optimal number of clusters is two. However, we are here to learn, so I decided to analyse a model with four clusters.

After determining the number of clusters I run the k-means algorithm again.

#k-means clustering with four centers
# FIX: cluster the scaled data, not the distance matrix (kmeans computes
# its own Euclidean distances from the observations)
km <- kmeans(scaled_Boston, centers = 4)
# scatterplot matrix of the original variables coloured by cluster
pairs(Boston, col = km$cluster)

It seems that when the data is divided into four clusters there are some clear differences in the distributions of several variables. Crim, zn, indus and black are variables where one can distinguish clear patterns between clusters. Some variables (rad & tax) show that sometimes one or two clusters form a clear distribution, but the observations of the other two clusters are ambiguous and there is no clear pattern to be recognised.

#BONUS: LDA using clusters as target classes#

#Perform k-means on the original Boston data with some reasonable number of clusters (> 2). Remember to standardize the dataset. Then perform LDA using the clusters as target classes. Include all the variables in the Boston data in the LDA model. Visualize the results with a biplot (include arrows representing the relationships of the original variables to the LDA solution). Interpret the results. Which variables are the most influencial linear separators for the clusters? 
#Loading and scaling Boston data
scaled_Boston <- as.data.frame(scale(Boston))
#calculating the euclidean distance matrix between the observations
#(kept for reference; k-means computes its own distances from the data)
dist_eu <- dist(scaled_Boston)
#k-means clustering with a fixed seed for reproducibility
set.seed(123)
# FIX: cluster the scaled observations themselves, not the distance matrix
km <- kmeans(scaled_Boston, centers = 4)
# attach the cluster assignment directly as a column named "clust"
# (replaces the original as.data.frame/colnames roundtrip)
scaled_Boston$clust <- km$cluster
summary(scaled_Boston)
##       crim                 zn               indus              chas        
##  Min.   :-0.419367   Min.   :-0.48724   Min.   :-1.5563   Min.   :-0.2723  
##  1st Qu.:-0.410563   1st Qu.:-0.48724   1st Qu.:-0.8668   1st Qu.:-0.2723  
##  Median :-0.390280   Median :-0.48724   Median :-0.2109   Median :-0.2723  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.007389   3rd Qu.: 0.04872   3rd Qu.: 1.0150   3rd Qu.:-0.2723  
##  Max.   : 9.924110   Max.   : 3.80047   Max.   : 2.4202   Max.   : 3.6648  
##       nox                rm               age               dis         
##  Min.   :-1.4644   Min.   :-3.8764   Min.   :-2.3331   Min.   :-1.2658  
##  1st Qu.:-0.9121   1st Qu.:-0.5681   1st Qu.:-0.8366   1st Qu.:-0.8049  
##  Median :-0.1441   Median :-0.1084   Median : 0.3171   Median :-0.2790  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5981   3rd Qu.: 0.4823   3rd Qu.: 0.9059   3rd Qu.: 0.6617  
##  Max.   : 2.7296   Max.   : 3.5515   Max.   : 1.1164   Max.   : 3.9566  
##       rad               tax             ptratio            black        
##  Min.   :-0.9819   Min.   :-1.3127   Min.   :-2.7047   Min.   :-3.9033  
##  1st Qu.:-0.6373   1st Qu.:-0.7668   1st Qu.:-0.4876   1st Qu.: 0.2049  
##  Median :-0.5225   Median :-0.4642   Median : 0.2746   Median : 0.3808  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.6596   3rd Qu.: 1.5294   3rd Qu.: 0.8058   3rd Qu.: 0.4332  
##  Max.   : 1.6596   Max.   : 1.7964   Max.   : 1.6372   Max.   : 0.4406  
##      lstat              medv             clust      
##  Min.   :-1.5296   Min.   :-1.9063   Min.   :1.000  
##  1st Qu.:-0.7986   1st Qu.:-0.5989   1st Qu.:2.000  
##  Median :-0.1811   Median :-0.1449   Median :3.000  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   :2.943  
##  3rd Qu.: 0.6024   3rd Qu.: 0.2683   3rd Qu.:4.000  
##  Max.   : 3.5453   Max.   : 2.9865   Max.   :4.000
#Original Boston dataset is now scaled and the result of K-means clustering is saved to the variable *clust*


#LDA = linear discriminant analysis: the k-means cluster assignment is the
#target and all 14 scaled variables are predictors
lda.fit.km <- lda(clust ~. , data = scaled_Boston)
#print the fit: priors, group means, coefficients, trace
lda.fit.km
## Call:
## lda(clust ~ ., data = scaled_Boston)
## 
## Prior probabilities of groups:
##         1         2         3         4 
## 0.1304348 0.2272727 0.2114625 0.4308300 
## 
## Group means:
##         crim         zn      indus       chas        nox         rm        age
## 1  1.4330759 -0.4872402  1.0689719  0.4435073  1.3439101 -0.7461469  0.8575386
## 2  0.2797949 -0.4872402  1.1892663 -0.2723291  0.8998296 -0.2770011  0.7716696
## 3 -0.3912182  1.2671159 -0.8754697  0.5739635 -0.7359091  0.9938426 -0.6949417
## 4 -0.3894453 -0.2173896 -0.5212959 -0.2723291 -0.5203495 -0.1157814 -0.3256000
##          dis        rad        tax     ptratio       black      lstat
## 1 -0.9620552  1.2941816  1.2970210  0.42015742 -1.65562038  1.1930953
## 2 -0.7723199  0.9006160  1.0311612  0.60093343 -0.01717546  0.6116223
## 3  0.7751031 -0.5965444 -0.6369476 -0.96586616  0.34190729 -0.8200275
## 4  0.3182404 -0.5741127 -0.6240070  0.02986213  0.34248644 -0.2813666
##          medv
## 1 -0.81904111
## 2 -0.54636549
## 3  1.11919598
## 4 -0.01314324
## 
## Coefficients of linear discriminants:
##                 LD1        LD2         LD3
## crim    -0.18113078  0.5012256  0.60535205
## zn      -0.43297497  1.0486194 -0.67406151
## indus   -1.37753200 -0.3016928 -1.07034034
## chas     0.04307937  0.7598229  0.22448239
## nox     -1.04674638  0.3861005  0.33268952
## rm       0.14912869  0.1510367 -0.67942589
## age      0.09897424 -0.0523110 -0.26285587
## dis     -0.13139210  0.1593367  0.03487882
## rad     -0.65824136 -0.5189795 -0.48145070
## tax     -0.28903561  0.5773959 -0.10350513
## ptratio -0.22236843 -0.1668597  0.09181715
## black    0.42730704 -0.5843973 -0.89869354
## lstat   -0.24320629  0.6197780  0.01119242
## medv    -0.21961575  0.9485829  0.17065360
## 
## Proportion of trace:
##    LD1    LD2    LD3 
## 0.7596 0.1768 0.0636
#the function for lda biplot arrows
# NOTE(review): this redefines lda.arrows() identically to the earlier
# definition; the duplicate could be removed and the first one reused.
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
  # coefficients of the linear discriminants (one row per variable)
  heads <- coef(x)
  arrows(x0 = 0, y0 = 0, 
         x1 = myscale * heads[,choices[1]], 
         y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
  # label each arrow tip with the variable name
  text(myscale * heads[,choices], labels = row.names(heads), 
       cex = tex, col=color, pos=3)
}
#target classes as numeric cluster ids (1..4) for plotting
classes <- as.numeric(scaled_Boston$clust)
#classes
#plot the first two discriminants, coloured/shaped by cluster
plot(lda.fit.km, dimen = 2, col = classes, pch = classes)
# overlay arrows showing each variable's contribution to LD1/LD2
lda.arrows(lda.fit.km, myscale = 3)

#Super-bonus# A 3D plot where the color of the observations is based on the crime classes of the train set

# predictors only: drop the target so columns match lda.fit$scaling rows
model_predictors <- dplyr::select(train, -crime)
#check the dimensions
#dim(model_predictors)
#dim(lda.fit$scaling)
# project the train observations onto the discriminants: X %*% scaling
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
#3d scatter of the three discriminant scores, coloured by crime class
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = train$crime)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

A 3D plot where the color of the observations is based on the k-means clusters

# predictors only: drop the cluster column before projecting
model_predictors <- dplyr::select(scaled_Boston, -clust)
#check the dimensions
#dim(model_predictors)
#dim(lda.fit.km$scaling)
# project the observations onto the cluster-LDA discriminants
matrix_product <- as.matrix(model_predictors) %*% lda.fit.km$scaling
matrix_product <- as.data.frame(matrix_product)
#3D scatter of discriminant scores, coloured by k-means cluster
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = scaled_Boston$clust)

The colors of both plots are based on four classes. It seems that the k-means plot shows the different clusters more clearly than the plot based on the crime classification.